library(ggplot2)
library(caTools)
library(plotly)
library(MASS)
library(htmlwidgets)
# Simulation parameters ----
# n: observations drawn per class (diseased / non-diseased)
n <- 1000
# n_iter: how many logistic-regression fits to average over
n_iter <- 100
# sample_ratio: fraction of the full dataset used in each fit
sample_ratio <- 0.1
# Preallocate storage for the per-iteration AUC values
auc_values <- numeric(n_iter)
# Simulate data for the diseased and non-diseased classes using MASS::mvrnorm
# with a shared covariance matrix (chosen so the two predictors are correlated)
# and class-specific mean vectors, then combine everything into one dataset.
# NOTE(review): the mean vectors below give the *non-diseased* class slightly
# higher means, so the classes overlap heavily (hence the near-0.5 AUC later).
# Confirm whether "diseased class has a higher mean" was the actual intent.
# Simulate correlated bivariate-normal predictors for each class ----
# Fix the RNG so the simulation is reproducible
set.seed(123)
# Shared covariance matrix: unit variances, correlation 0.2 between predictors
cov_matrix <- matrix(c(1, 0.2, 0.2, 1), nrow = 2, ncol = 2)
# Class mean vectors (predictor1 mean is twice predictor2 mean in each class).
# NOTE(review): the two vectors are nearly identical, with the non-diseased
# class slightly higher -- the heavy overlap is why the downstream AUC sits
# near 0.5. Confirm this is intended.
mean_diseased <- c(1, 0.5)
mean_non_diseased <- c(1.1, 0.55)
# Draw n observations per class (diseased first, so the RNG stream matches)
diseased_data <- mvrnorm(n, mu = mean_diseased, Sigma = cov_matrix)
non_diseased_data <- mvrnorm(n, mu = mean_non_diseased, Sigma = cov_matrix)
# Stack the classes into one data frame: class 1 = diseased, 0 = non-diseased
data <- rbind(
  data.frame(predictor1 = diseased_data[, 1],
             predictor2 = diseased_data[, 2],
             class = 1),
  data.frame(predictor1 = non_diseased_data[, 1],
             predictor2 = non_diseased_data[, 2],
             class = 0)
)
# Overlaid class histograms for each predictor ----
# Predictor 1: visualize how much the two class distributions overlap
ggplot(data, aes(x = predictor1, fill = as.factor(class))) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  labs(title = "Distribution of Diseased and Non-Diseased Classes for Predictor 1")
# Predictor 2: same view for the second predictor
ggplot(data, aes(x = predictor2, fill = as.factor(class))) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  labs(title = "Distribution of Diseased and Non-Diseased Classes for Predictor 2")
# 2-D kernel density estimates per class (30 x 30 grid) ----
diseased_subset <- subset(data, class == 1)
non_diseased_subset <- subset(data, class == 0)
# density1/density0 are kde2d lists with $x, $y grid vectors and a $z matrix
density1 <- kde2d(diseased_subset$predictor1, diseased_subset$predictor2, n = 30)
density0 <- kde2d(non_diseased_subset$predictor1, non_diseased_subset$predictor2, n = 30)
# Interactive 3-D density surfaces for the two classes ----
# A plotly colorscale is a list of (position, color) pairs. Each pair must be
# a list() so the position stays numeric -- the original c(0, "red") silently
# coerced the anchor 0 to the string "0".
# Diseased class: red -> pink surface
p1 <- plot_ly(z = ~density1$z) %>%
  add_surface(
    x = ~density1$x,
    y = ~density1$y,
    colorscale = list(list(0, "red"), list(1, "pink")),
    opacity = 0.8
  )
# Non-diseased class: blue -> lightblue surface
p2 <- plot_ly(z = ~density0$z) %>%
  add_surface(
    x = ~density0$x,
    y = ~density0$y,
    colorscale = list(list(0, "blue"), list(1, "lightblue")),
    opacity = 0.8
  )
# Side-by-side surfaces with shared axes; evaluating p renders the widget
p <- subplot(p1, p2, nrows = 1, shareX = TRUE, shareY = TRUE)
p
# Repeatedly fit logistic regression on random subsamples and record AUC ----
# NOTE(review): each model is evaluated on the same rows it was fit on, so
# the recorded AUCs are in-sample (slightly optimistic) -- confirm intended.
# Initialize the coefficient accumulator before the loop instead of the
# original if (i == 1) branch inside it; 0 + named vector keeps the names.
coefficients <- 0
n_rows <- nrow(data)  # hoist the loop-invariant row count
for (i in seq_len(n_iter)) {
  # Draw a random subsample of the data (sample(n, size) == sample(1:n, size))
  sample_index <- sample(n_rows, sample_ratio * n_rows)
  sample_data <- data[sample_index, ]
  # Fit logistic regression with both predictors
  model <- glm(class ~ predictor1 + predictor2, data = sample_data, family = binomial)
  # Predicted probabilities on the same subsample (in-sample)
  predictions <- predict(model, newdata = sample_data, type = "response")
  # colAUC() returns a 1 x 1 matrix; coerce explicitly before storing
  auc <- colAUC(predictions, sample_data$class, plotROC = FALSE)
  auc_values[i] <- as.numeric(auc)
  # Accumulate coefficients so the average can be computed after the loop
  coefficients <- coefficients + model$coefficients
}
# Summarise the simulation ----
# Mean in-sample AUC across all iterations
average_auc <- mean(auc_values)
auc_message <- paste("Average AUC: ", round(average_auc, 2))
print(auc_message)
## [1] "Average AUC: 0.56"
# Mean coefficients (intercept, predictor1, predictor2) over all fits
average_coefficients <- coefficients / n_iter
coef_message <- paste("Average coefficients: ", round(average_coefficients, 2))
print(coef_message)
## [1] "Average coefficients: 0.12" "Average coefficients: -0.13"
## [3] "Average coefficients: 0.02"